Python 3.8.15 (default, Nov 24 2022, 15:19:38)
Type 'copyright', 'credits' or 'license' for more information
IPython 8.7.0 -- An enhanced Interactive Python. Type '?' for
help.
Summary of completed work:
Summary of pending tasks:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import umap.umap_ as umap
import re, os, json
from sentence_transformers import SentenceTransformer
from typing import Union, Dict, Sequence, Callable, Tuple, Generator
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, OPTICS
from hdbscan import HDBSCAN
from itertools import product
from tqdm import tqdm
from toolz import pipe
from copy import deepcopy
from collections import Counter, defaultdict
from itertools import chain, count
from functools import partial
from dap_aria_mapping.getters.openalex import get_openalex_entities
from exploration_utils import (
get_sample,
filter_entities,
embed,
run_clustering_generators,
make_subplot_embeddings,
make_dendrogram
)
# Seed NumPy's global random generator before any data is sampled.
np.random.seed(42)

# Load the OpenAlex entities, then apply the sampling and frequency-filter
# helpers in that order. The helpers live in exploration_utils; the meaning of
# their thresholds is defined there.
openalex_entities = filter_entities(
    get_sample(get_openalex_entities(), score_threshold=60, num_articles=500),
    min_freq=60,
    max_freq=95,
)
2022-12-08 18:05:56,811 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
# Create embeddings
# Flatten every entity list into one collection, de-duplicate it, and embed the
# unique entities with the sentence-transformer model named below.
embeddings = pipe(
    openalex_entities.values(),
    lambda oa: chain(*oa),
    set,
    # A set of strings iterates in an order that changes between interpreter
    # runs (hash randomisation), so sort to give the embedding rows -- and every
    # seeded clustering built on them -- a reproducible order.
    # NOTE(review): assumes the entities are mutually comparable (e.g. all str).
    sorted,
    partial(embed, model="paraphrase-MiniLM-L6-v2"),
)
# Define UMAP
# Each entry pairs a UMAP keyword with the candidate values to try.
params = [
    ["n_neighbors", [5]],
    ["min_dist", [0.05]],
    ["n_components", [2]],
]

# Expand the grid into one keyword dict per combination of candidate values.
keys = [name for name, _ in params]
permuts = list(product(*(values for _, values in params)))
param_perms = [dict(zip(keys, perm)) for perm in permuts]

# Project the embeddings once per parameter set and draw a scatter of each.
# `embeddings_2d` is overwritten on every pass, so later cells see the
# projection from the last parameter set only.
for perm in param_perms:
    embeddings_2d = umap.UMAP(**perm).fit_transform(embeddings)
    fig = plt.figure(figsize=(10, 10))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=1)
    fig.suptitle(f"{perm}")
    plt.show()
2022-12-08 18:06:28,222 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2 2022-12-08 18:06:28,585 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu
Batches: 0%| | 0/37 [00:00<?, ?it/s]
# Three successive parameter sets (3, 2 and 2 clusters) for each of KMeans and
# agglomerative clustering. How the sets are chained is decided inside
# run_clustering_generators (exploration_utils).
cluster_configs = [
    [KMeans, [{"n_clusters": k, "n_init": 2} for k in (3, 2, 2)]],
    [AgglomerativeClustering, [{"n_clusters": k} for k in (3, 2, 2)]],
]
cluster_outputs_s, plot_dicts = run_clustering_generators(cluster_configs, embeddings)
# One panel per (algorithm, parameter set): 2 rows x 3 columns.
fig, axis = plt.subplots(2, 3, figsize=(24, 16), dpi=200)
for idx, (cdict, cluster) in enumerate(plot_dicts):
    # Column position within the row of three panels. Computed with `%` rather
    # than `_, lvl = divmod(idx, 3)`: binding `_` at notebook top level makes
    # IPython stop updating its last-output shortcut.
    lvl = idx % 3
    make_subplot_embeddings(
        embeddings=embeddings_2d,
        clabels=[int(e) for e in cdict.values()],
        axis=axis.flat[idx],
        label=f"{cluster.__module__} {lvl}",
        s=4,
    )
# Report the silhouette score of every fitted model in the strict run.
for output in cluster_outputs_s:
    print(
        # The cluster count now comes before the word "clusters"; the previous
        # template printed "<module> clusters - <n>".
        "Silhouette score - {} - {} clusters: {}".format(
            output["model"].__module__,
            output["model"].get_params()["n_clusters"],
            # Convert to a built-in float first: rounding the NumPy float32
            # score directly printed values such as 0.03400000184774399.
            round(float(output["silhouette"]), 3),
        )
    )
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) Silhouette score - sklearn.cluster._kmeans clusters - 3: 0.03400000184774399 Silhouette score - sklearn.cluster._kmeans clusters - 2: 0.026000000536441803 Silhouette score - sklearn.cluster._kmeans clusters - 2: 0.019999999552965164 Silhouette score - sklearn.cluster._agglomerative clusters - 3: 0.023000000044703484 Silhouette score - sklearn.cluster._agglomerative clusters - 2: 0.017999999225139618 Silhouette score - sklearn.cluster._agglomerative clusters - 2: 0.010999999940395355
# One parameter grid per algorithm. List-valued entries are the candidate
# values; how they are expanded is up to run_clustering_generators
# (exploration_utils).
cluster_configs = [
    [KMeans, [dict(n_clusters=[5, 10, 25, 50], n_init=5)]],
    [AgglomerativeClustering, [dict(n_clusters=[5, 10, 25, 50])]],
    [DBSCAN, [dict(eps=[0.05, 0.1], min_samples=[4, 8])]],
    [HDBSCAN, [dict(min_cluster_size=[4, 8], min_samples=[4, 8])]],
]
cluster_outputs_f, plot_dicts = run_clustering_generators(cluster_configs, embeddings)
# 4 x 4 grid: one panel per entry of plot_dicts, filled in row-major order.
fig, axis = plt.subplots(4, 4, figsize=(40, 40), dpi=200)
for idx, (cdict, cluster) in enumerate(plot_dicts):
    point_labels = list(map(int, cdict.values()))
    make_subplot_embeddings(
        embeddings=embeddings_2d,
        clabels=point_labels,
        axis=axis.flat[idx],
        label=f"{cluster.__module__}",
        cmap="gist_ncar",
    )
# Report the silhouette score of every fitted model in the grid run.
for output in cluster_outputs_f:
    print(
        "Silhouette score - {}: {}".format(
            output["model"].__module__,
            # Convert to a built-in float first: rounding the NumPy float32
            # score directly printed values such as 0.03099999949336052.
            round(float(output["silhouette"]), 3),
        )
    )
Silhouette score - sklearn.cluster._kmeans: 0.03099999949336052 Silhouette score - sklearn.cluster._kmeans: 0.02500000037252903 Silhouette score - sklearn.cluster._kmeans: 0.029999999329447746 Silhouette score - sklearn.cluster._kmeans: 0.029999999329447746 Silhouette score - sklearn.cluster._agglomerative: 0.01600000075995922 Silhouette score - sklearn.cluster._agglomerative: 0.017000000923871994 Silhouette score - sklearn.cluster._agglomerative: 0.01600000075995922 Silhouette score - sklearn.cluster._agglomerative: 0.027000000700354576 Silhouette score - sklearn.cluster._dbscan: 0 Silhouette score - sklearn.cluster._dbscan: 0 Silhouette score - sklearn.cluster._dbscan: 0 Silhouette score - sklearn.cluster._dbscan: 0 Silhouette score - hdbscan.hdbscan_: 0.0010000000474974513 Silhouette score - hdbscan.hdbscan_: -0.07400000095367432 Silhouette score - hdbscan.hdbscan_: -0.03500000014901161 Silhouette score - hdbscan.hdbscan_: -0.06700000166893005
# Fit a single agglomerative model with 100 clusters and build a dendrogram
# from its labels and fitted model via make_dendrogram (exploration_utils).
cluster_configs = [[AgglomerativeClustering,[{"n_clusters": 100}]]]
cluster_outputs_d, plot_dicts = run_clustering_generators(cluster_configs, embeddings)
# NOTE(review): cluster_outputs_d is indexed by string key here, whereas the
# other cells iterate the first return value as a sequence of dicts -- confirm
# what run_clustering_generators returns for a single configuration.
dendrogram = make_dendrogram(cluster_dict=cluster_outputs_d["labels"], model=cluster_outputs_d["model"])
# Plot six dendrogram entries (dendrogram[-7:-1]), one per panel, and print the
# silhouette score of each split.
fig, axis = plt.subplots(2, 3, figsize=(24, 16), dpi=200)
n_points = embeddings.shape[0]
for i, (split, ax) in enumerate(zip(dendrogram[-7:-1], axis.flat)):
    # Panels are numbered downwards from the panel count (6) to 1.
    level = axis.size - i
    # Pair every tag with the position of the cluster that contains it, keeping
    # only tags below the number of embedded rows (presumably larger tags are
    # internal merge nodes -- TODO confirm against make_dendrogram).
    level_clust = sorted(
        (tag, cluster)
        for cluster, cluster_tags in enumerate(split.values())
        for tag in cluster_tags
        if tag < n_points
    )
    # Sorted by tag, so the labels line up with the rows of the embeddings.
    label_clust = [cluster for _, cluster in level_clust]
    ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=label_clust, s=4)
    ax.set_title("Level {}".format(level))
    silhouette = silhouette_score(embeddings, label_clust)
    # Convert to a built-in float first: rounding the NumPy float32 score
    # directly printed values such as 0.014000000432133675.
    print("Silhouette score - dendrogram level {}: {}".format(level, round(float(silhouette), 3)))
plt.show()
Silhouette score - dendrogram level 6: 0.014000000432133675 Silhouette score - dendrogram level 5: 0.014000000432133675 Silhouette score - dendrogram level 4: 0.01600000075995922 Silhouette score - dendrogram level 3: 0.019999999552965164 Silhouette score - dendrogram level 2: 0.023000000044703484 Silhouette score - dendrogram level 1: 0.039000000804662704
# Three KMeans parameter sets (400, 100 and 5 clusters). The "centroids" flag
# is interpreted by run_clustering_generators (exploration_utils), which also
# receives the 2-D projection here.
cluster_configs = [
    [
        KMeans,
        [
            dict(n_clusters=400, n_init=2, centroids=False),
            dict(n_clusters=100, n_init=2, centroids=True),
            dict(n_clusters=5, n_init=2, centroids=True),
        ],
    ],
]
cluster_outputs_c, plot_dicts = run_clustering_generators(
    cluster_configs, embeddings, embeddings_2d=embeddings_2d
)
# One panel per KMeans parameter set, with its silhouette score printed.
fig, axis = plt.subplots(1, 3, figsize=(24, 8), dpi=200)
for idx, cdict in enumerate(cluster_outputs_c):
    centroid_params = cdict["centroid_params"]
    if centroid_params is None:
        # No centroid data for this output: colour the full 2-D projection by
        # the stored labels.
        axis[idx].scatter(
            embeddings_2d[:, 0],
            embeddings_2d[:, 1],
            c=list(cdict["labels"].values()),
            s=1,
        )
    else:
        # Centroid data available: plot the stored 2-D points, coloured by the
        # fitted model's labels and sized by the stored "sizes".
        centroids_2d = centroid_params["n_embeddings_2d"]
        axis[idx].scatter(
            centroids_2d[:, 0],
            centroids_2d[:, 1],
            c=cdict["model"].labels_,
            s=centroid_params["sizes"],
        )
    # Convert to a built-in float first: rounding the NumPy float32 score
    # directly printed values such as 0.07100000232458115.
    print(f"Silhouette score ({idx}): {round(float(cdict['silhouette']), 3)}")
Silhouette score (0): 0.07100000232458115 Silhouette score (1): 0.0010000000474974513 Silhouette score (2): 0.006000000052154064
import altair as alt
def make_analysis_dicts(cluster_outputs, model_module_name):
    """Select the outputs of one model type and flatten their label paths.

    Args:
        cluster_outputs: iterable of dicts carrying a "labels" mapping and a
            "model" object.
        model_module_name: value that ``model.__module__`` must equal for an
            output to be kept.

    Returns:
        A list of ``[labels, model]`` pairs, where ``labels`` maps each key of
        the original mapping to its value's items joined with "_" after
        conversion to ``str``.
    """
    selected = []
    for output in cluster_outputs:
        model = output["model"]
        if model.__module__ != model_module_name:
            continue
        joined = {}
        for key, label_path in output["labels"].items():
            joined[key] = "_".join(str(part) for part in label_path)
        selected.append([joined, model])
    return selected
def make_analysis_dicts_alt(cluster_outputs, model_module_name):
    """Select the outputs of one model type, keeping only each last label.

    Args:
        cluster_outputs: iterable of dicts carrying a "labels" mapping and a
            "model" object.
        model_module_name: value that ``model.__module__`` must equal for an
            output to be kept.

    Returns:
        A list of ``[labels, model]`` pairs, where ``labels`` maps each key of
        the original mapping to the final element (``value[-1]``) of its value.
    """
    selected = []
    for output in cluster_outputs:
        model = output["model"]
        if model.__module__ != model_module_name:
            continue
        last_labels = {}
        for key, label_path in output["labels"].items():
            last_labels[key] = label_path[-1]
        selected.append([last_labels, model])
    return selected
def make_analysis_dataframe(analysis_dicts, coltype=""):
    """Turn ``[labels, model]`` pairs into one single-column-per-pair DataFrame.

    Args:
        analysis_dicts: sequence of ``[labels, model]`` pairs; ``labels`` maps
            row keys to one value each.
        coltype: suffix appended to every column name.

    Returns:
        The column-wise concatenation of one DataFrame per pair. Each column is
        named ``<model module without dots>_<pair position>_<coltype>`` and is
        indexed by the keys of that pair's ``labels``.
    """
    frames = []
    for position, (labels, model) in enumerate(analysis_dicts):
        module_tag = model.__module__.replace(".", "")
        column_name = "{}_{}_{}".format(module_tag, position, coltype)
        frames.append(
            pd.DataFrame.from_dict(labels, orient='index', columns=[column_name])
        )
    return pd.concat(frames, axis=1)
def make_plots(analysis_df):
    """Build one bar chart of value counts per column and concatenate them.

    Args:
        analysis_df: DataFrame whose columns are each charted separately.

    Returns:
        The result of ``alt.concat`` over the per-column charts. Each chart
        encodes the column as an ordinal x axis sorted by descending count,
        with ``count()`` on the y axis.
    """
    charts = []
    for column in analysis_df.columns:
        bars = alt.Chart(analysis_df).mark_bar()
        by_count = alt.EncodingSortField(
            field=f"{column}",
            op="count",
            order='descending',
        )
        x_axis = alt.X(f'{column}:O', sort=by_count)
        charts.append(bars.encode(x=x_axis, y='count()'))
    return alt.concat(*charts)
# Label-count charts for the KMeans models of the strict run.
pipe(
    cluster_outputs_s,
    partial(make_analysis_dicts, model_module_name="sklearn.cluster._kmeans"),
    make_analysis_dataframe,
    make_plots,
)
/home/ampudia19/anaconda3/envs/dap_aria_mapping/lib/python3.8/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead. for col_name, dtype in df.dtypes.iteritems():
# Label-count charts for the agglomerative models of the strict run.
pipe(
    cluster_outputs_s,
    partial(make_analysis_dicts, model_module_name="sklearn.cluster._agglomerative"),
    make_analysis_dataframe,
    make_plots,
)
/home/ampudia19/anaconda3/envs/dap_aria_mapping/lib/python3.8/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead. for col_name, dtype in df.dtypes.iteritems():
# Label-count charts for the KMeans models of the grid run.
pipe(
    cluster_outputs_f,
    partial(make_analysis_dicts, model_module_name="sklearn.cluster._kmeans"),
    make_analysis_dataframe,
    make_plots,
)
/home/ampudia19/anaconda3/envs/dap_aria_mapping/lib/python3.8/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead. for col_name, dtype in df.dtypes.iteritems():
# Build one label column per dendrogram entry in dendrogram[-7:-1], named
# tree_lvl_6 down to tree_lvl_1, then chart the label counts of each column.
ls = []
for i, tree_split in enumerate(dendrogram[-7:-1]):
    # (tag, cluster position) pairs, keeping only tags below the number of
    # embedded rows.
    level_clust = [
        (tag, cluster)
        for cluster, cluster_tags in enumerate(tree_split.values())
        for tag in cluster_tags
        if tag < embeddings.shape[0]
    ]
    # Order by tag so the labels line up with the rows of `embeddings`.
    level_clust.sort(key=lambda pair: pair[0])
    label_clust = [cluster for _, cluster in level_clust]
    ls.append(
        pd.DataFrame(
            {"tree_lvl_{}".format(6-i): label_clust},
            index=embeddings.index,
        )
    )
analysis_df_d = pd.concat(ls, axis=1)
make_plots(analysis_df_d)
/home/ampudia19/anaconda3/envs/dap_aria_mapping/lib/python3.8/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead. for col_name, dtype in df.dtypes.iteritems():
# Label-count charts for the centroid KMeans run, using each entity's last label.
pipe(
    cluster_outputs_c,
    partial(make_analysis_dicts_alt, model_module_name="sklearn.cluster._kmeans"),
    make_analysis_dataframe,
    make_plots,
)
/home/ampudia19/anaconda3/envs/dap_aria_mapping/lib/python3.8/site-packages/altair/utils/core.py:317: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead. for col_name, dtype in df.dtypes.iteritems():
# Collect every clustering result as a DataFrame, in this order: strict KMeans,
# strict agglomerative, grid KMeans, grid agglomerative, dendrogram levels,
# centroid KMeans. Column suffixes "s", "f" and "c" identify the run.
meta_cluster_df = [
    *(
        make_analysis_dataframe(make_analysis_dicts_alt(cluster_outputs_s, module), "s")
        for module in ("sklearn.cluster._kmeans", "sklearn.cluster._agglomerative")
    ),
    *(
        make_analysis_dataframe(make_analysis_dicts(cluster_outputs_f, module), "f")
        for module in ("sklearn.cluster._kmeans", "sklearn.cluster._agglomerative")
    ),
    analysis_df_d,
    make_analysis_dataframe(
        make_analysis_dicts_alt(cluster_outputs_c, "sklearn.cluster._kmeans"), "c"
    ),
]
# Count, for every ordered pair of tags, the number of clustering columns in
# which both tags carry the same label. A tag always pairs with itself, so the
# diagonal equals the total number of columns.
all_tags = meta_cluster_df[0].index
cooccur_dict = {tag: {other: 0 for other in all_tags} for tag in all_tags}

for df in meta_cluster_df:
    # Work on a copy with the index exposed as a "tag" column. The frames in
    # meta_cluster_df (including analysis_df_d from an earlier cell) are no
    # longer modified in place, so this cell can be re-run safely.
    # NOTE(review): code after this cell that relied on those frames gaining a
    # "tag" column would need `tagged` instead -- none is visible here.
    tagged = df.rename_axis("tag").reset_index()
    for col in [x for x in tagged.columns if x != "tag"]:
        pairs = tagged[[col, "tag"]]
        # Self-join on the label: one row per ordered pair of tags that share it.
        pairs = pairs.merge(pairs, on=col)
        for tag_x, tag_y in zip(pairs["tag_x"], pairs["tag_y"]):
            cooccur_dict[tag_x][tag_y] += 1

cooccur_df = pd.DataFrame.from_dict(cooccur_dict)
cooccur_df.to_parquet("cooccur_df.parquet")
cooccur_df.head(10)
| Autism | Beta-lactam | Volatile organic compound | Region of interest | Mineral | Genomic library | Sobel operator | Semantic Web | Asthma | List of potato diseases | ... | Sparring | Multi-core processor | Determinant | Extreme learning machine | Scanning electron microscope | Cancer | Immunoglobulin M | Graphical model | Hardware description language | Scatter plot | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Autism | 23 | 10 | 6 | 2 | 6 | 8 | 2 | 2 | 17 | 6 | ... | 2 | 2 | 1 | 0 | 3 | 13 | 13 | 1 | 2 | 3 |
| Beta-lactam | 10 | 23 | 10 | 3 | 10 | 7 | 3 | 1 | 11 | 8 | ... | 5 | 1 | 4 | 3 | 2 | 15 | 15 | 2 | 1 | 0 |
| Volatile organic compound | 6 | 10 | 23 | 2 | 22 | 11 | 2 | 2 | 7 | 13 | ... | 4 | 2 | 3 | 4 | 3 | 8 | 8 | 3 | 2 | 1 |
| Region of interest | 2 | 3 | 2 | 23 | 2 | 2 | 14 | 2 | 3 | 2 | ... | 4 | 4 | 16 | 4 | 5 | 2 | 2 | 3 | 4 | 7 |
| Mineral | 6 | 10 | 22 | 2 | 23 | 11 | 2 | 2 | 7 | 13 | ... | 4 | 2 | 3 | 4 | 3 | 8 | 8 | 3 | 2 | 1 |
| Genomic library | 8 | 7 | 11 | 2 | 11 | 23 | 2 | 2 | 7 | 10 | ... | 2 | 2 | 1 | 2 | 3 | 6 | 6 | 1 | 2 | 3 |
| Sobel operator | 2 | 3 | 2 | 14 | 2 | 2 | 23 | 4 | 1 | 2 | ... | 4 | 2 | 17 | 4 | 7 | 2 | 2 | 3 | 2 | 7 |
| Semantic Web | 2 | 1 | 2 | 2 | 2 | 2 | 4 | 23 | 1 | 2 | ... | 6 | 10 | 3 | 17 | 5 | 2 | 2 | 17 | 10 | 5 |
| Asthma | 17 | 11 | 7 | 3 | 7 | 7 | 1 | 1 | 23 | 7 | ... | 3 | 3 | 2 | 1 | 2 | 14 | 15 | 2 | 3 | 2 |
| List of potato diseases | 6 | 8 | 13 | 2 | 13 | 10 | 2 | 2 | 7 | 23 | ... | 3 | 2 | 3 | 4 | 2 | 8 | 8 | 3 | 2 | 1 |
10 rows × 1161 columns